## *************************************
##        A1 Summary Statistics       
##         Hao Zhang & Ye Zhang
##             2025/7/24
## *************************************

library(readstata13)
library(stargazer)
library(tidyverse)

# load data
load("../analysis data/firm analysis.RData")
setwd("../raw data")
mayor <- read.dta13("city mayor 1990-2015.dta")
mayor <- subset(mayor, year >= 2001 & year <= 2007)
secretary <- read.dta13("city secretary 1990-2015.dta")
secretary <- subset(secretary, year >= 2001 & year <= 2007)
city <- read.dta13("city panel 1995-2010.dta")
city <- subset(city, year >= 2001 & year <= 2007)

# run inverse hyperbolic sine
ihs <- function(x) {
  y <- log(x + sqrt(x ^ 2 + 1))
  return(y)
}

# firm summary statistics
firm$log_employment <- log(firm$employment)
firm$log_output1 <- log(firm$output1 + 1)
firm <- firm %>% filter(wage > 0)
firm$log_wage <- log(firm$wage)
firm$log_profit <- ihs(firm$profit)
firm <- firm %>% select(insurance_dummy, medical_dummy, rate1, rate2, log_employment, log_output1, log_wage, log_profit, fie, soe)
stargazer(firm, omit.summary.stat = c("median", "p25", "p75")) 

# mayor summary statistics (see line #88-90 for political connection)
mayor <- mayor %>% select(begin, seq, ethnicity, educ, gender)
stargazer(mayor, omit.summary.stat = c("median", "p25", "p75"))
length(mayor$ethnicity[mayor$ethnicity != ""]) # manual check to find 16 distinct values
length(mayor$educ[mayor$educ != ""]) # manual check to find 6 distinct values
length(mayor$gender[mayor$gender != ""]) # manual check to find 2 distinct values

# secretary summary statistics
secretary <- secretary %>% select(begin_sc, ethnicity_sc, educ_sc, gender_sc)
stargazer(secretary, omit.summary.stat = c("median", "p25", "p75"))
length(secretary$ethnicity_sc[secretary$ethnicity_sc != ""]) # manual check to find 14 distinct values
length(secretary$educ_sc[secretary$educ != ""]) # manual check to find 15 distinct values
length(secretary$gender_sc[secretary$gender_sc != ""]) # manual check to find 2 distinct values

# city summary statistics
city <- city %>% filter(substr(as.character(city$citycode), 3, 4) != "01")
city1 <- city %>% select(citycode, year, gdp) %>% mutate(year = year + 1)
city <- left_join(city, city1, by = c("citycode", "year"))
city$gdpgrowth <- city$gdp.x/city$gdp.y - 1
city <- city %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(province != 11, province != 12, province != 31, citycode != 5102) %>%
  group_by(province, year) %>%
  mutate(gdpgrowth_mean = mean(gdpgrowth, na.rm = TRUE)) %>% 
  mutate(year = year + 1) %>% filter(gdpgrowth >= -0.2, gdpgrowth <= 0.6) %>%
  mutate(gdp_pressure = gdpgrowth_mean - gdpgrowth) %>%
  rename(gdp = gdp.x) %>% ungroup()

# add tax collection, unemployment rate, and medical resources
city1 <- readxl::read_xlsx("city panel 1996-2014.xlsx") # warnings due to conversion from text to number for the year variable for districts and provinces
citycode <- readxl::read_xlsx("citycode.xlsx") %>%
  mutate(citycode = as.integer(citycode/100))
city1 <- left_join(city1, citycode, by = "city", relationship = "many-to-many") %>% distinct() %>%
  filter(year >= 1999 & year <= 2007)
city <- left_join(city, city1, by = c("citycode", "year"))

city <- city %>% mutate(capacity = (tax_revenue*100000 - profits)/gdp/10, 
                        deficit = expenditure - revenue_city, 
                        unemp_rate = unemployment/population)

# label collection authority changes
city <- city %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(year >= 2001) %>%
  mutate(tax = ifelse(province == 33 & year >= 2004 | province == 13 & year >= 2002 | 
                        province == 42 & year >= 2002 | province == 44 & year >= 2002 | 
                        province == 32 & year >= 2005 | province == 53 & year >= 2006, 1, 0))


city$log_FDI <- log(city$FDI + 1)
city$log_gdppc <- log(city$gdp/city$population)
city$log_deficit <- ihs(city$deficit/city$gdp)
city$log_capacity <- ihs(city$capacity)
connection <- city %>% select(connection) %>% as.data.frame()
stargazer(connection, omit.summary.stat = c("median", "p25", "p75"))
city <- city %>% select(gdp_pressure, gdpgrowth, log_gdppc, log_deficit, log_capacity, tax, log_FDI, connection) %>% as.data.frame()
stargazer(city, omit.summary.stat = c("median", "p25", "p75"))